Variable descriptions were obtained from King County, Department of Assessments. All feature engineering should be done in the first code chunks of your document.
# Read one house-sales CSV and apply the shared feature engineering.
#
# Keeping the pipeline in a single helper guarantees that the training and the
# test set are prepared in exactly the same way.
#
# path: location of the CSV file.
# Returns a data frame without the `id` column, with `date`, `yr_built` and
# `yr_renovated` stored as Date and `waterfront`/`condition` as labelled
# factors.
read_housedata <- function(path) {
  sales <- read.csv(
    path,
    colClasses = c(
      id = "character", date = "character",
      yr_built = "character", zipcode = "factor", grade = "factor"
    )
  )
  sales$date <- as.Date(sales$date, format = "%Y%m%d")
  # NOTE(review): the labels are matched to the sorted distinct values found in
  # the file, so factor() stops with an error when a category is absent.
  sales$waterfront <- factor(sales$waterfront, labels = c("No", "Yes"))
  sales$condition <- factor(
    sales$condition,
    labels = c("poor", "fair", "average", "good", "very good")
  )
  # yr_built is read as text; convert it once so both year columns are numeric
  # instead of being silently coerced to character inside ifelse().
  built_year <- as.integer(sales$yr_built)
  # A renovation year of 0 is replaced by the construction year.
  renovated_year <- ifelse(
    sales$yr_renovated == 0, built_year, sales$yr_renovated
  )
  sales$yr_built <- as.Date(ISOdate(built_year, 9, 1)) # Complete Year, Sept 1
  sales$yr_renovated <- as.Date(ISOdate(renovated_year, 9, 1)) # Last renovated Year, Sept 1
  # Drop the identifier by name rather than by position.
  sales$id <- NULL
  sales
}

housedata <- read_housedata("~/git_repositories/STT3851ClassRepo/Rmarkdown/Data/housedata.csv")
#### Perform same steps with test set
housedataT <- read_housedata("~/git_repositories/STT3851ClassRepo/Rmarkdown/Data/housedataTEST.csv")
# Interactive table of columns 12 to 20 (sqft_above through sqft_lot15),
# shown without row names.
library(DT)
datatable(housedata[, 12:20], rownames = FALSE)
Consider predicting the price (price) of a house based on a certain feature (sqft_living). Start by graphing the relationship.
library(ggplot2)
# Baseline scatterplot of price against sqft_living, one point per sale.
p1 <- ggplot(housedata, aes(sqft_living, price)) +
  geom_point() +
  theme_bw()
p1
Overplotting is problematic. What should we do?
# Option 1: make the points semi-transparent (alpha).
p2 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
  geom_point(alpha = 0.05, color = "blue") +
  theme_bw()
p2
# Option 2: count the points in a 50-bin rectangular grid (default fill scale).
p3 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
  stat_bin2d(bins = 50) +
  theme_bw()
p3
# Same rectangular bins with a light-blue-to-red fill limited to 0-1000.
p4 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
  stat_bin2d(bins = 50) +
  scale_fill_gradient(
    low = "lightblue", high = "red",
    limits = c(0, 1000)
  ) +
  theme_bw()
p4
# Option 3: hexagonal bins; the fill range is limited to 0-800 with a legend
# break every 200.
p5 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
  stat_binhex(bins = 50) +
  scale_fill_gradient(
    low = "lightblue", high = "red",
    limits = c(0, 800), breaks = seq(0, 800, by = 200)
  ) +
  theme_bw()
p5
**Note:** For both stat_bin2d and stat_binhex, if you manually specify the range, and there is a bin that falls outside that range because it has too many or too few points, that bin will show up as grey rather than the color at the high or low end of the range. Observe the grey hexagons in the lower left corner of the above graph.
# Hexagonal bins with the fill range set to 0-1000 and a legend break every 200.
p6 <- ggplot(housedata, aes(sqft_living, price)) +
  stat_binhex(bins = 50) +
  scale_fill_gradient(
    low = "lightblue",
    high = "red",
    limits = c(0, 1000),
    breaks = seq(0, 1000, by = 200)
  ) +
  theme_bw()
p6
library(car) # red line affected by outlier, green ignoring the outlier (robust)
# Pairwise scatterplots. The trailing notes record the analyst's reading of
# each plot. The price/bedrooms plot was previously drawn twice; it is now
# drawn once.
scatterplot(x = housedata$price, y = housedata$bedrooms) # not much dependent
scatterplot(x = housedata$bedrooms, y = housedata$bathrooms) # dependent excluding the 33 outlier
scatterplot(x = housedata$price, y = housedata$bathrooms) # dependent
scatterplot(x = housedata$price, y = housedata$sqft_lot) # not dependent
scatterplot(x = housedata$price, y = housedata$view) # dependent but mostly view is 0 so not dependent
scatterplot(x = housedata$price, y = housedata$grade) # dependent
scatterplot(x = housedata$price, y = housedata$floors) # not dependent from boxplot
scatterplot(x = housedata$price, y = housedata$condition) # nearly not dependent
scatterplot(x = housedata$price, y = housedata$waterfront) # not dependent as nearly no waterfront
scatterplot(x = housedata$price, y = housedata$sqft_above) # dependent
scatterplot(x = housedata$price, y = housedata$sqft_basement) # dependent
# Disabled: the visible feature engineering creates no `age` or `renage` column.
# scatterplot(x = housedata$price, y = housedata$age) # low negative dependent
scatterplot(x = housedata$price, y = housedata$zipcode)
# scatterplot(x = housedata$price, y = housedata$renage) # not dependent as very few houses are renovated
# Scatterplot using ggplot, with a fitted linear-model line
ggplot(data = housedata, mapping = aes(x = sqft_living, y = price)) +
  geom_point(colour = "skyblue") +
  geom_smooth(method = "lm")
# Data frame of the numeric columns used for the correlation analysis.
# It keeps price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, view,
# sqft_above, sqft_basement, lat, long, sqft_living15 and sqft_lot15; it drops
# date, the factor columns (waterfront, condition, grade, zipcode) and the
# yr_built / yr_renovated dates. Columns are selected by name so the selection
# survives any change in column order.
housedata1 <- housedata[, c(
  "price", "bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors",
  "view", "sqft_above", "sqft_basement", "lat", "long",
  "sqft_living15", "sqft_lot15"
)]
# Correlation matrix
cor(housedata1)
price bedrooms bathrooms sqft_living sqft_lot
price 1.00000000 0.31284286 0.52334477 0.70291635 0.088238107
bedrooms 0.31284286 1.00000000 0.52923162 0.59105983 0.030179053
bathrooms 0.52334477 0.52923162 1.00000000 0.75455302 0.082139581
sqft_living 0.70291635 0.59105983 0.75455302 1.00000000 0.166967283
sqft_lot 0.08823811 0.03017905 0.08213958 0.16696728 1.000000000
floors 0.25235756 0.18028523 0.50066694 0.35267511 -0.002951851
view 0.39102268 0.07884375 0.18312596 0.27981310 0.069978368
sqft_above 0.60527752 0.49174312 0.68455295 0.87631944 0.176005462
sqft_basement 0.33122956 0.31056084 0.29077234 0.44288611 0.018691884
lat 0.30948443 -0.01002422 0.02676418 0.05693083 -0.085417697
long 0.02131272 0.13604729 0.22151426 0.23737409 0.225347502
sqft_living15 0.58348082 0.40306677 0.56816564 0.75627424 0.147707827
sqft_lot15 0.08080643 0.02784234 0.08467962 0.17830644 0.727774079
floors view sqft_above sqft_basement
price 0.252357558 0.391022681 0.605277522 0.33122956
bedrooms 0.180285231 0.078843754 0.491743119 0.31056084
bathrooms 0.500666944 0.183125959 0.684552945 0.29077234
sqft_living 0.352675112 0.279813103 0.876319445 0.44288611
sqft_lot -0.002951851 0.069978368 0.176005462 0.01869188
floors 1.000000000 0.026258735 0.522710921 -0.24145116
view 0.026258735 1.000000000 0.163954243 0.27514730
sqft_above 0.522710921 0.163954243 1.000000000 -0.04379916
sqft_basement -0.241451164 0.275147303 -0.043799158 1.00000000
lat 0.049952734 0.008885553 0.001422037 0.11541978
long 0.125918561 -0.076033506 0.341128260 -0.14261452
sqft_living15 0.280417017 0.279937620 0.732554007 0.20500421
sqft_lot15 -0.007389463 0.068809179 0.188503973 0.01894595
lat long sqft_living15 sqft_lot15
price 0.309484427 0.02131272 0.58348082 0.080806426
bedrooms -0.010024220 0.13604729 0.40306677 0.027842339
bathrooms 0.026764178 0.22151426 0.56816564 0.084679619
sqft_living 0.056930825 0.23737409 0.75627424 0.178306444
sqft_lot -0.085417697 0.22534750 0.14770783 0.727774079
floors 0.049952734 0.12591856 0.28041702 -0.007389463
view 0.008885553 -0.07603351 0.27993762 0.068809179
sqft_above 0.001422037 0.34112826 0.73255401 0.188503973
sqft_basement 0.115419784 -0.14261452 0.20500421 0.018945951
lat 1.000000000 -0.13331116 0.04993821 -0.089826111
long -0.133311159 1.00000000 0.33399885 0.253890095
sqft_living15 0.049938206 0.33399885 1.00000000 0.184561578
sqft_lot15 -0.089826111 0.25389009 0.18456158 1.000000000
# Corrplot: graphical display of the correlation matrix of housedata1
library(corrplot)
corrplot(cor(housedata1))
# Converting numeric variables to categorical ones (bedrooms, bathrooms, grade,
# zipcode). Left disabled: grade and zipcode are already read as factors
# through colClasses when the CSV is loaded.
#housedata$bedrooms <- as.factor(housedata$bedrooms)
#housedata$bathrooms <- as.factor(housedata$bathrooms)
#housedata$grade <- as.factor(housedata$grade)
#housedata$zipcode <- as.factor(housedata$zipcode)
# Checking the structure (column names and types) now
str(housedata)
'data.frame': 17384 obs. of 20 variables:
$ date : Date, format: "2014-10-13" "2014-12-09" ...
$ price : num 221900 538000 180000 604000 510000 ...
$ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
$ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
$ sqft_living : int 1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
$ sqft_lot : int 5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
$ floors : num 1 2 1 1 1 1 2 1 1 2 ...
$ waterfront : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ view : int 0 0 0 0 0 0 0 0 0 0 ...
$ condition : Factor w/ 5 levels "poor","fair",..: 3 3 3 5 3 3 3 3 3 3 ...
$ grade : Factor w/ 12 levels "1","10","11",..: 10 10 9 10 11 3 10 10 10 10 ...
$ sqft_above : int 1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
$ sqft_basement: int 0 400 0 910 0 1530 0 0 730 0 ...
$ yr_built : Date, format: "1955-09-01" "1951-09-01" ...
$ yr_renovated : Date, format: "1955-09-01" "1991-09-01" ...
$ zipcode : Factor w/ 70 levels "98001","98002",..: 67 56 17 59 38 30 3 69 61 24 ...
$ lat : num 47.5 47.7 47.7 47.5 47.6 ...
$ long : num -122 -122 -122 -122 -122 ...
$ sqft_living15: int 1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
$ sqft_lot15 : int 5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
# Model 1: price regressed on sqft_living, bedrooms, bathrooms, grade and
# sqft_above.
model1 <- lm(
  formula = price ~ sqft_living + bedrooms + bathrooms + grade + sqft_above,
  data = housedata
)
summary(model1)
Call:
lm(formula = price ~ sqft_living + bedrooms + bathrooms + grade +
sqft_above, data = housedata)
Residuals:
Min 1Q Median 3Q Max
-1628606 -123162 -25774 90436 4581884
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.987e+04 2.348e+05 0.425 0.670595
sqft_living 2.427e+02 4.932e+00 49.205 < 2e-16 ***
bedrooms -2.629e+04 2.576e+03 -10.205 < 2e-16 ***
bathrooms -3.834e+03 3.745e+03 -1.024 0.306051
grade10 5.383e+05 2.352e+05 2.288 0.022132 *
grade11 8.410e+05 2.356e+05 3.569 0.000359 ***
grade12 1.389e+06 2.369e+05 5.863 4.62e-09 ***
grade13 2.289e+06 2.454e+05 9.327 < 2e-16 ***
grade3 4.637e+04 2.711e+05 0.171 0.864216
grade4 5.160e+04 2.416e+05 0.214 0.830906
grade5 7.285e+04 2.355e+05 0.309 0.757049
grade6 9.235e+04 2.349e+05 0.393 0.694289
grade7 1.228e+05 2.349e+05 0.523 0.601199
grade8 1.915e+05 2.350e+05 0.815 0.415073
grade9 3.354e+05 2.351e+05 1.427 0.153706
sqft_above -9.740e+01 4.719e+00 -20.641 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 234800 on 17368 degrees of freedom
Multiple R-squared: 0.597, Adjusted R-squared: 0.5966
F-statistic: 1715 on 15 and 17368 DF, p-value: < 2.2e-16
# Analysis-of-variance table for model1, one row per model term plus residuals.
anova(model1)
Analysis of Variance Table
Response: price
Df Sum Sq Mean Sq F value Pr(>F)
sqft_living 1 1.1738e+15 1.1738e+15 21292.8959 < 2e-16 ***
bedrooms 1 3.8454e+13 3.8454e+13 697.5391 < 2e-16 ***
bathrooms 1 2.0958e+11 2.0958e+11 3.8017 0.05122 .
grade 11 1.8230e+14 1.6572e+13 300.6138 < 2e-16 ***
sqft_above 1 2.3488e+13 2.3488e+13 426.0662 < 2e-16 ***
Residuals 17368 9.5747e+14 5.5128e+10
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 2: the predictors of model1 plus zipcode.
model2 <- lm(
  formula = price ~ sqft_living + bedrooms + bathrooms + grade + sqft_above +
    zipcode,
  data = housedata
)
summary(model2)
Call:
lm(formula = price ~ sqft_living + bedrooms + bathrooms + grade +
sqft_above + zipcode, data = housedata)
Residuals:
Min 1Q Median 3Q Max
-1553993 -71559 -3707 56624 4046688
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -8.408e+04 1.741e+05 -0.483 0.629206
sqft_living 1.881e+02 3.730e+00 50.419 < 2e-16 ***
bedrooms -2.241e+04 1.932e+03 -11.599 < 2e-16 ***
bathrooms 6.484e+03 2.801e+03 2.315 0.020651 *
grade10 2.927e+05 1.742e+05 1.680 0.092986 .
grade11 5.133e+05 1.745e+05 2.942 0.003269 **
grade12 1.000e+06 1.755e+05 5.699 1.22e-08 ***
grade13 1.802e+06 1.818e+05 9.913 < 2e-16 ***
grade3 1.342e+05 2.006e+05 0.669 0.503654
grade4 7.677e+04 1.787e+05 0.430 0.667555
grade5 8.142e+04 1.744e+05 0.467 0.640510
grade6 5.785e+04 1.740e+05 0.333 0.739460
grade7 5.220e+04 1.740e+05 0.300 0.764118
grade8 6.994e+04 1.740e+05 0.402 0.687712
grade9 1.512e+05 1.741e+05 0.868 0.385212
sqft_above 8.337e-01 3.764e+00 0.222 0.824702
zipcode98002 1.932e+04 1.710e+04 1.130 0.258670
zipcode98003 5.637e+03 1.526e+04 0.370 0.711749
zipcode98004 7.750e+05 1.493e+04 51.892 < 2e-16 ***
zipcode98005 3.028e+05 1.826e+04 16.581 < 2e-16 ***
zipcode98006 2.677e+05 1.347e+04 19.874 < 2e-16 ***
zipcode98007 2.551e+05 1.895e+04 13.463 < 2e-16 ***
zipcode98008 3.186e+05 1.524e+04 20.905 < 2e-16 ***
zipcode98010 6.401e+04 2.162e+04 2.961 0.003074 **
zipcode98011 1.294e+05 1.738e+04 7.448 9.93e-14 ***
zipcode98014 8.334e+04 2.030e+04 4.106 4.05e-05 ***
zipcode98019 8.429e+04 1.750e+04 4.816 1.48e-06 ***
zipcode98022 4.516e+04 1.645e+04 2.745 0.006062 **
zipcode98023 -2.341e+04 1.333e+04 -1.757 0.078989 .
zipcode98024 1.713e+05 2.316e+04 7.397 1.46e-13 ***
zipcode98027 1.576e+05 1.414e+04 11.148 < 2e-16 ***
zipcode98028 1.316e+05 1.541e+04 8.539 < 2e-16 ***
zipcode98029 2.097e+05 1.493e+04 14.050 < 2e-16 ***
zipcode98030 1.881e+03 1.550e+04 0.121 0.903393
zipcode98031 2.128e+04 1.532e+04 1.389 0.164933
zipcode98032 9.784e+03 1.997e+04 0.490 0.624196
zipcode98033 3.721e+05 1.382e+04 26.927 < 2e-16 ***
zipcode98034 2.155e+05 1.303e+04 16.536 < 2e-16 ***
zipcode98038 2.857e+04 1.283e+04 2.228 0.025895 *
zipcode98039 1.178e+06 2.815e+04 41.857 < 2e-16 ***
zipcode98040 5.742e+05 1.593e+04 36.057 < 2e-16 ***
zipcode98042 1.118e+04 1.312e+04 0.852 0.394156
zipcode98045 1.007e+05 1.666e+04 6.043 1.55e-09 ***
zipcode98052 2.394e+05 1.301e+04 18.402 < 2e-16 ***
zipcode98053 1.905e+05 1.405e+04 13.561 < 2e-16 ***
zipcode98055 4.202e+04 1.554e+04 2.704 0.006867 **
zipcode98056 1.073e+05 1.389e+04 7.725 1.18e-14 ***
zipcode98058 3.893e+04 1.352e+04 2.880 0.003985 **
zipcode98059 7.654e+04 1.358e+04 5.636 1.76e-08 ***
zipcode98065 8.202e+04 1.473e+04 5.567 2.63e-08 ***
zipcode98070 1.925e+05 2.054e+04 9.373 < 2e-16 ***
zipcode98072 1.511e+05 1.538e+04 9.826 < 2e-16 ***
zipcode98074 1.710e+05 1.375e+04 12.437 < 2e-16 ***
zipcode98075 1.808e+05 1.443e+04 12.530 < 2e-16 ***
zipcode98077 9.797e+04 1.736e+04 5.644 1.68e-08 ***
zipcode98092 -2.236e+04 1.418e+04 -1.577 0.114805
zipcode98102 5.005e+05 2.248e+04 22.270 < 2e-16 ***
zipcode98103 3.386e+05 1.294e+04 26.158 < 2e-16 ***
zipcode98105 5.094e+05 1.635e+04 31.154 < 2e-16 ***
zipcode98106 1.165e+05 1.479e+04 7.877 3.56e-15 ***
zipcode98107 3.559e+05 1.554e+04 22.900 < 2e-16 ***
zipcode98108 1.175e+05 1.784e+04 6.583 4.75e-11 ***
zipcode98109 5.204e+05 2.097e+04 24.813 < 2e-16 ***
zipcode98112 6.090e+05 1.563e+04 38.957 < 2e-16 ***
zipcode98115 3.472e+05 1.291e+04 26.901 < 2e-16 ***
zipcode98116 3.235e+05 1.475e+04 21.934 < 2e-16 ***
zipcode98117 3.288e+05 1.304e+04 25.211 < 2e-16 ***
zipcode98118 1.765e+05 1.328e+04 13.294 < 2e-16 ***
zipcode98119 5.177e+05 1.741e+04 29.739 < 2e-16 ***
zipcode98122 3.468e+05 1.538e+04 22.548 < 2e-16 ***
zipcode98125 2.281e+05 1.398e+04 16.323 < 2e-16 ***
zipcode98126 2.075e+05 1.475e+04 14.062 < 2e-16 ***
zipcode98133 1.655e+05 1.333e+04 12.417 < 2e-16 ***
zipcode98136 2.929e+05 1.557e+04 18.810 < 2e-16 ***
zipcode98144 3.015e+05 1.454e+04 20.740 < 2e-16 ***
zipcode98146 1.368e+05 1.523e+04 8.982 < 2e-16 ***
zipcode98148 7.603e+04 2.850e+04 2.667 0.007649 **
zipcode98155 1.664e+05 1.381e+04 12.051 < 2e-16 ***
zipcode98166 1.200e+05 1.592e+04 7.540 4.93e-14 ***
zipcode98168 5.026e+04 1.573e+04 3.194 0.001405 **
zipcode98177 2.663e+05 1.573e+04 16.926 < 2e-16 ***
zipcode98178 7.320e+04 1.548e+04 4.730 2.26e-06 ***
zipcode98188 3.155e+04 1.909e+04 1.653 0.098385 .
zipcode98198 5.332e+04 1.527e+04 3.492 0.000481 ***
zipcode98199 3.891e+05 1.490e+04 26.106 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 172600 on 17299 degrees of freedom
Multiple R-squared: 0.7831, Adjusted R-squared: 0.782
F-statistic: 743.5 on 84 and 17299 DF, p-value: < 2.2e-16
# Analysis-of-variance table for model2, one row per model term plus residuals.
anova(model2)
Analysis of Variance Table
Response: price
Df Sum Sq Mean Sq F value Pr(>F)
sqft_living 1 1.1738e+15 1.1738e+15 39406.9608 < 2.2e-16 ***
bedrooms 1 3.8454e+13 3.8454e+13 1290.9421 < 2.2e-16 ***
bathrooms 1 2.0958e+11 2.0958e+11 7.0358 0.007997 **
grade 11 1.8230e+14 1.6572e+13 556.3487 < 2.2e-16 ***
sqft_above 1 2.3488e+13 2.3488e+13 788.5248 < 2.2e-16 ***
zipcode 69 4.4217e+14 6.4083e+12 215.1322 < 2.2e-16 ***
Residuals 17299 5.1530e+14 2.9788e+10
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(randomForest)
# A random forest is built with random resampling, so fix the seed to make
# the fitted model reproducible from one run to the next.
set.seed(3851)
model3 <- randomForest(
  price ~ bathrooms + sqft_above,
  data = housedata
)
Use a simple linear model to predict the price of a house with 2,500 \(\text{ft}^2\).
# Simple linear regression of price on sqft_living alone.
slm <- lm(
  formula = price ~ sqft_living,
  data = housedata
)
summary(slm)
Call:
lm(formula = price ~ sqft_living, data = housedata)
Residuals:
Min 1Q Median 3Q Max
-1490607 -148265 -23758 105710 4349512
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -47116.079 4923.344 -9.57 <2e-16 ***
sqft_living 281.959 2.164 130.29 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 263000 on 17382 degrees of freedom
Multiple R-squared: 0.4941, Adjusted R-squared: 0.4941
F-statistic: 1.698e+04 on 1 and 17382 DF, p-value: < 2.2e-16
# Predicted price from slm for a single house with sqft_living = 2500.
predict(slm, newdata = data.frame(sqft_living = 2500))
1
657781
# Add the fitted linear-model line to the hexbin plot and mark the prediction
# for sqft_living = 2500 with dashed red guide lines.
p6 +
  geom_smooth(method = "lm") +
  geom_vline(xintercept = 2500, linetype = "dashed", color = "red") +
  geom_hline(
    yintercept = predict(slm, newdata = data.frame(sqft_living = 2500)),
    linetype = "dashed",
    color = "red"
  ) +
  labs(x = "Living Space (square feet)", y = "Price ($)")
mod.zip: the most basic model (intercept only).
# Intercept-only model: no predictors, only a constant term.
mod.zip <- lm(
  formula = price ~ 1,
  data = housedata
)
summary(mod.zip)
Call:
lm(formula = price ~ 1, data = housedata)
Residuals:
Min 1Q Median 3Q Max
-464367 -219367 -89367 100633 7160633
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 539367 2804 192.4 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 369700 on 17383 degrees of freedom
mod.all: a model using all current features except sqft_basement.
# Full model: every column of housedata other than price is a predictor,
# with sqft_basement removed from the formula.
mod.all <- lm(
  formula = price ~ . - sqft_basement,
  data = housedata
)
summary(mod.all)
Call:
lm(formula = price ~ . - sqft_basement, data = housedata)
Residuals:
Min 1Q Median 3Q Max
-1539648 -60120 2700 56110 3478116
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.135e+07 6.410e+06 -6.450 1.15e-10 ***
date 1.145e+02 1.016e+01 11.269 < 2e-16 ***
bedrooms -1.431e+04 1.708e+03 -8.377 < 2e-16 ***
bathrooms 2.394e+04 2.764e+03 8.663 < 2e-16 ***
sqft_living 1.170e+02 3.714e+00 31.495 < 2e-16 ***
sqft_lot 2.574e-01 4.059e-02 6.342 2.32e-10 ***
floors -2.963e+04 3.344e+03 -8.862 < 2e-16 ***
waterfrontYes 6.641e+05 1.465e+04 45.331 < 2e-16 ***
view 4.953e+04 1.828e+03 27.102 < 2e-16 ***
conditionfair 9.809e+04 3.414e+04 2.873 0.004075 **
conditionaverage 1.042e+05 3.174e+04 3.282 0.001032 **
conditiongood 1.318e+05 3.175e+04 4.151 3.32e-05 ***
conditionvery good 1.814e+05 3.194e+04 5.677 1.39e-08 ***
grade10 1.173e+05 1.549e+05 0.757 0.448960
grade11 3.143e+05 1.552e+05 2.025 0.042869 *
grade12 7.525e+05 1.561e+05 4.821 1.44e-06 ***
grade13 1.661e+06 1.615e+05 10.287 < 2e-16 ***
grade3 -7.935e+02 1.776e+05 -0.004 0.996434
grade4 -9.356e+04 1.588e+05 -0.589 0.555737
grade5 -1.035e+05 1.548e+05 -0.668 0.503898
grade6 -1.122e+05 1.547e+05 -0.725 0.468200
grade7 -1.091e+05 1.547e+05 -0.705 0.480711
grade8 -8.819e+04 1.548e+05 -0.570 0.568766
grade9 -1.235e+04 1.548e+05 -0.080 0.936432
sqft_above 5.176e+01 3.858e+00 13.416 < 2e-16 ***
yr_built -3.312e+00 2.798e-01 -11.838 < 2e-16 ***
yr_renovated 2.591e+00 2.847e-01 9.103 < 2e-16 ***
zipcode98002 1.909e+04 1.516e+04 1.259 0.207972
zipcode98003 -1.207e+04 1.341e+04 -0.900 0.368288
zipcode98004 7.176e+05 2.455e+04 29.229 < 2e-16 ***
zipcode98005 2.534e+05 2.626e+04 9.650 < 2e-16 ***
zipcode98006 2.205e+05 2.140e+04 10.305 < 2e-16 ***
zipcode98007 2.136e+05 2.691e+04 7.938 2.18e-15 ***
zipcode98008 2.371e+05 2.569e+04 9.230 < 2e-16 ***
zipcode98010 1.108e+05 2.280e+04 4.860 1.18e-06 ***
zipcode98011 5.783e+04 3.358e+04 1.722 0.085099 .
zipcode98014 1.005e+05 3.709e+04 2.711 0.006722 **
zipcode98019 7.545e+04 3.613e+04 2.088 0.036804 *
zipcode98022 6.976e+04 2.010e+04 3.470 0.000521 ***
zipcode98023 -5.426e+04 1.240e+04 -4.376 1.21e-05 ***
zipcode98024 1.806e+05 3.184e+04 5.672 1.43e-08 ***
zipcode98027 1.737e+05 2.203e+04 7.885 3.33e-15 ***
zipcode98028 3.778e+04 3.257e+04 1.160 0.246053
zipcode98029 2.355e+05 2.510e+04 9.383 < 2e-16 ***
zipcode98030 1.126e+04 1.465e+04 0.769 0.441829
zipcode98031 1.564e+04 1.537e+04 1.018 0.308915
zipcode98032 -1.284e+04 1.792e+04 -0.717 0.473687
zipcode98033 2.958e+05 2.789e+04 10.608 < 2e-16 ***
zipcode98034 1.264e+05 2.990e+04 4.228 2.37e-05 ***
zipcode98038 7.791e+04 1.655e+04 4.708 2.53e-06 ***
zipcode98039 1.101e+06 3.245e+04 33.942 < 2e-16 ***
zipcode98040 4.680e+05 2.194e+04 21.327 < 2e-16 ***
zipcode98042 2.810e+04 1.418e+04 1.982 0.047456 *
zipcode98045 1.814e+05 3.066e+04 5.917 3.33e-09 ***
zipcode98052 1.967e+05 2.842e+04 6.922 4.62e-12 ***
zipcode98053 1.808e+05 3.037e+04 5.954 2.67e-09 ***
zipcode98055 2.146e+04 1.718e+04 1.249 0.211728
zipcode98056 6.367e+04 1.868e+04 3.409 0.000653 ***
zipcode98058 3.359e+04 1.620e+04 2.073 0.038186 *
zipcode98059 6.952e+04 1.837e+04 3.784 0.000155 ***
zipcode98065 1.346e+05 2.815e+04 4.782 1.75e-06 ***
zipcode98070 -7.192e+04 2.176e+04 -3.305 0.000950 ***
zipcode98072 9.011e+04 3.321e+04 2.713 0.006671 **
zipcode98074 1.588e+05 2.686e+04 5.912 3.45e-09 ***
zipcode98075 1.598e+05 2.576e+04 6.204 5.64e-10 ***
zipcode98077 6.050e+04 3.468e+04 1.745 0.081073 .
zipcode98092 -1.196e+03 1.326e+04 -0.090 0.928155
zipcode98102 4.390e+05 2.935e+04 14.959 < 2e-16 ***
zipcode98103 2.424e+05 2.709e+04 8.948 < 2e-16 ***
zipcode98105 3.997e+05 2.773e+04 14.414 < 2e-16 ***
zipcode98106 5.587e+04 2.006e+04 2.785 0.005359 **
zipcode98107 2.487e+05 2.780e+04 8.948 < 2e-16 ***
zipcode98108 6.070e+04 2.237e+04 2.714 0.006652 **
zipcode98109 4.148e+05 2.861e+04 14.498 < 2e-16 ***
zipcode98112 5.380e+05 2.544e+04 21.151 < 2e-16 ***
zipcode98115 2.480e+05 2.744e+04 9.040 < 2e-16 ***
zipcode98116 2.080e+05 2.232e+04 9.320 < 2e-16 ***
zipcode98117 2.180e+05 2.782e+04 7.834 4.99e-15 ***
zipcode98118 1.124e+05 1.948e+04 5.771 8.02e-09 ***
zipcode98119 4.068e+05 2.701e+04 15.061 < 2e-16 ***
zipcode98122 2.721e+05 2.421e+04 11.235 < 2e-16 ***
zipcode98125 1.097e+05 2.968e+04 3.695 0.000220 ***
zipcode98126 1.206e+05 2.068e+04 5.831 5.61e-09 ***
zipcode98133 5.335e+04 3.061e+04 1.743 0.081338 .
zipcode98136 1.768e+05 2.100e+04 8.420 < 2e-16 ***
zipcode98144 2.204e+05 2.244e+04 9.823 < 2e-16 ***
zipcode98146 3.043e+04 1.876e+04 1.622 0.104828
zipcode98148 3.650e+04 2.635e+04 1.385 0.165989
zipcode98155 4.353e+04 3.191e+04 1.364 0.172629
zipcode98166 5.831e+03 1.725e+04 0.338 0.735412
zipcode98168 9.246e+03 1.825e+04 0.507 0.612501
zipcode98177 1.079e+05 3.190e+04 3.382 0.000720 ***
zipcode98178 -5.908e+03 1.861e+04 -0.317 0.750888
zipcode98188 -4.184e+03 1.898e+04 -0.220 0.825497
zipcode98198 -3.087e+04 1.448e+04 -2.132 0.033009 *
zipcode98199 2.724e+05 2.639e+04 10.323 < 2e-16 ***
lat 2.233e+05 6.632e+04 3.367 0.000761 ***
long -2.361e+05 4.727e+04 -4.995 5.95e-07 ***
sqft_living15 1.521e+01 3.029e+00 5.023 5.15e-07 ***
sqft_lot15 -1.159e-01 6.429e-02 -1.803 0.071344 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 150200 on 17284 degrees of freedom
Multiple R-squared: 0.8359, Adjusted R-squared: 0.835
F-statistic: 889.4 on 99 and 17284 DF, p-value: < 2.2e-16
# Analysis-of-variance table for mod.all, one row per model term plus residuals.
anova(mod.all)
Analysis of Variance Table
Response: price
Df Sum Sq Mean Sq F value Pr(>F)
date 1 2.9956e+10 2.9956e+10 1.3282 0.2491485
bedrooms 1 2.3249e+14 2.3249e+14 10307.9306 < 2.2e-16 ***
bathrooms 1 4.2285e+14 4.2285e+14 18748.2537 < 2.2e-16 ***
sqft_living 1 5.5816e+14 5.5816e+14 24747.5215 < 2.2e-16 ***
sqft_lot 1 3.8694e+12 3.8694e+12 171.5587 < 2.2e-16 ***
floors 1 9.0483e+10 9.0483e+10 4.0118 0.0451986 *
waterfront 1 8.3044e+13 8.3044e+13 3681.9764 < 2.2e-16 ***
view 1 3.8892e+13 3.8892e+13 1724.3793 < 2.2e-16 ***
condition 4 1.6341e+13 4.0854e+12 181.1348 < 2.2e-16 ***
grade 11 1.7804e+14 1.6185e+13 717.6222 < 2.2e-16 ***
sqft_above 1 7.0222e+12 7.0222e+12 311.3450 < 2.2e-16 ***
yr_built 1 8.4946e+13 8.4946e+13 3766.2921 < 2.2e-16 ***
yr_renovated 1 4.7609e+11 4.7609e+11 21.1086 4.370e-06 ***
zipcode 69 3.5814e+14 5.1905e+12 230.1340 < 2.2e-16 ***
lat 1 3.1253e+11 3.1253e+11 13.8567 0.0001979 ***
long 1 6.0257e+11 6.0257e+11 26.7164 2.382e-07 ***
sqft_living15 1 5.4648e+11 5.4648e+11 24.2294 8.630e-07 ***
sqft_lot15 1 7.3351e+10 7.3351e+10 3.2522 0.0713445 .
Residuals 17284 3.8983e+14 2.2554e+10
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Your goal is to create a model with as small a test error as possible. Note that the residual standard error of model mod.all (the square root of the training RSS divided by its residual degrees of freedom) is \(1.5018076\times 10^{5}\).
library(ggmap)
# Base map centred on lon -122.1, lat 47.48, requested from Google at zoom 10.
# NOTE(review): source = "google" presumably needs a registered API key - confirm.
KingMap <- get_map(
  location = c(lon = -122.1, lat = 47.48),
  zoom = 10,
  source = "google",
  maptype = "roadmap"
)
# Plot every sale on the map: training set in blue, test set in red.
# Columns are referenced by bare name inside aes() so each layer reads them
# from its own `data` argument rather than from the global data frames.
ggmap(KingMap) +
  geom_point(
    aes(x = long, y = lat),
    data = housedata,
    alpha = .2,
    color = "blue",
    size = 0.01
  ) +
  geom_point(
    aes(x = long, y = lat),
    data = housedataT,
    alpha = .2,
    color = "red",
    size = 0.01
  ) +
  ggtitle("Houses Sold in King County, Wa (2014-2015)") +
  labs(x = "longitude", y = "latitude")
Submit a vector named (Yourlastname_Yourfirstname) with the predicted house prices for your model using the data frame housedataT.
Suppose your final model is mod.all.
# Predicted prices for the rows of the test set housedataT, using mod.all.
Yourlastname_Yourfirstname <- predict(mod.all, newdata = housedataT)
# Preview the first few predictions.
head(Yourlastname_Yourfirstname)
1 2 3 4 5 6
310196.9 845551.2 305946.0 532806.5 485256.6 469122.8
# Use write.csv() to save the prediction vector as Yourlastname_Yourfirstname.csv.
write.csv(Yourlastname_Yourfirstname, file = "Yourlastname_Yourfirstname.csv")
I will compute your \(\sqrt{MSPE}\).
# Square root of the mean squared prediction error, computed here on the first
# six predictions against the six reference prices listed below.
SMSPE <- sqrt(
  mean(
    (head(Yourlastname_Yourfirstname) -
      c(310000, 650000, 233000, 580500, 535000, 605000))^2
  )
)
SMSPE
[1] 105493.4